/*
 * Copyright (c) 2011-2013, 2018-2019 Genome Research Ltd.
 * Author(s): James Bonfield
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *    2. Redistributions in binary form must reproduce the above
 *       copyright notice, this list of conditions and the following
 *       disclaimer in the documentation and/or other materials provided
 *       with the distribution.
 *
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
 *       Institute nor the names of its contributors may be used to endorse
 *       or promote products derived from this software without specific
 *       prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// We use generic maps to turn 0-M into 0-N where N <= M
// before adding these into the context.  These are used
// for positions, running-diffs and quality values.
//
// This can be used as a simple divisor, eg pos/24 to get
// 2 bits of positional data for each quarter along a 100bp
// read, or it can be tailored for specific such as noting
// the first 5 cycles are poor, then we have stability and
// a gradual drop off in the last 20 or so.  Perhaps we then
// map pos 0-4=0, 5-79=1, 80-89=2, 90-99=3.
//
// We don't need to specify how many bits of data we are
// using (2 in the above example), as that is just implicit
// in the values in the map.  Specify not to use a map simply
// disables that context type (our map is essentially 0-M -> 0).

// Example of command line usage:
//
// f=~/scratch/data/q4
// cc -Wall -DTEST_MAIN -O3 -g fqzcomp_qual2.c -lm
// ./a.out $f > /tmp/_ && ./a.out -d < /tmp/_ > /tmp/__ && cmp /tmp/__ $f

#include "config.h"

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <math.h>
#include <limits.h>
#include <ctype.h>
#include <math.h>
#include <inttypes.h>
#include <sys/types.h>

//#define NO_THREADS
#ifndef NO_THREADS
#include <pthread.h>
#endif

#include "fqzcomp_qual.h"
#include "varint.h"

#define CTX_BITS 16
#define CTX_SIZE (1<<CTX_BITS)

#ifndef MIN
#  define MIN(a,b) ((a)<(b)?(a):(b))
#  define MAX(a,b) ((a)>(b)?(a):(b))
#endif

#define QMAX 256
#define QBITS 12
#define QSIZE (1<<QBITS)

#define NSYM 2
#include "c_simple_model.h"

#undef NSYM
#define NSYM QMAX
//#include "c_escape_model.h"
#include "c_simple_model.h"
//#include "c_cdf_model.h"
//#include "c_cdf16_model.h"

// An array of 0,0,0, 1,1,1,1, 3, 5,5
// is turned into a run-length of 3x0, 4x1, 0x2, 1x4, 0x4, 2x5,
// which then becomes 3 4 0 1 0 2.
//
// NB: size > 255 therefore means we need to repeatedly read to find
// the actual run length.
// Alternatively we could bit-encode instead of byte encode, eg BETA.
static int store_array(unsigned char *out, unsigned int *array, int size) {
    unsigned char tmp[2048];

    int i, j, k;
    for (i = j = k = 0; i < size; j++) {
	int run_len = i;
	while (i < size && array[i] == j)
	    i++;
	run_len = i-run_len;

	int r;
	do {
	    r = MIN(255, run_len);
	    tmp[k++] = r;
	    run_len -= r;
	} while (r == 255);
    }
    while (i < size)
	tmp[k++] = 0, j++;

    // RLE on out.
    //    1 2 3 3 3 3 3 4 4    5
    // => 1 2 3 3 +3... 4 4 +0 5
    int last = -1;
    for (i = j = 0; j < k; i++) {
	out[i] = tmp[j++];
	if (out[i] == last) {
	    int n = j;
	    while (j < k && tmp[j] == last)
		j++;
	    out[++i] = j-n;
	} else {
	    last = out[i];
	}
    }
    k = i;

//    fprintf(stderr, "Store_array %d => %d {", size, k);
//    for (i = 0; i < k; i++)
//	fprintf(stderr, "%d,", out[i]);
//    fprintf(stderr, "}\n");
    return k;
}

static int read_array(unsigned char *in, size_t in_size, unsigned int *array, int size) {
    unsigned char R[1024];
    int i, j, z, last = -1, nb = 0;

    size = MIN(1024, size);

    // Remove level one of run-len encoding
    for (i = j = z = 0; z < size && i < in_size; i++) {
	int run = in[i];
	R[j++] = run;
	z += run;
	if (run == last) {
	    if (i+1 >= in_size)
		return -1;
	    int copy = in[++i];
	    z += run * copy;
	    while (copy-- && z < size && j < 1024)
		R[j++] = run;
	}
	if (j >= 1024)
	    return -1;
	last = run;
    }
    nb = i;

    // Now expand inner level of run-length encoding
    int R_max = j;
    for (i = j = z = 0; j < size; i++) {
	int run_len = 0;
	int run_part;
	if (z >= R_max)
	    return -1;
	do {
	    run_part = R[z++];
	    run_len += run_part;
	} while (run_part == 255 && z < R_max);
	if (run_part == 255)
	    return -1;

	while (run_len && j < size)
	    run_len--, array[j++] = i;
    }

    return nb;
}

// FIXME: how to auto-tune these rather than trial and error?
// r2 = READ2
// qa = qual avg (0, 2, 4)
static int strat_opts[][12] = {
//   qb  qs pb ps db ds ql sl pl  dl  r2 qa
    {10, 5, 4,-1, 2, 1, 0, 14, 10, 14, 0,-1}, // basic options (level < 7)
    {8,  5, 7, 0, 0, 0, 0, 14, 8,  14, 1,-1}, // e.g. HiSeq 2000
    {12, 6, 2, 0, 2, 3, 0, 9,  12, 14, 0, 0}, // e.g. MiSeq
    {12, 6, 0, 0, 0, 0, 0, 12, 0,  0,  0, 0}, // e.g. IonTorrent; adaptive O1
    {0,  0, 0, 0, 0, 0, 0, 0,  0,  0,  0, 0}, // custom
};
static int nstrats = sizeof(strat_opts) / sizeof(*strat_opts);

#ifdef __SSE__
#   include <xmmintrin.h>
#else
#   define _mm_prefetch(a,b)
#endif

typedef struct {
    unsigned int qctx;  // quality sub-context
    unsigned int p;     // pos (bytes remaining)
    unsigned int add_d; // whether to update delta (skip first cycle)
    unsigned int delta; // delta running total
    unsigned int prevq; // previous quality
    unsigned int s;     // selector
    unsigned int qtot, qlen;
    unsigned int first_len;
} fqz_state;

static void dump_table(unsigned int *tab, int size, char *name) {
    int i, last = -99, run = 0;
    fprintf(stderr, "\t%s\t{", name);
    for (i = 0; i < size; i++) {
	if (tab[i] == last) {
	    run++;
	} else if (run == 1 && tab[i] == last+1) {
	    int first = last;
	    do {
		last = tab[i];
		i++;
	    } while (i < size && tab[i] == last+1);
	    i--;

	    // Want 0,1,2,3,3,3 as 0..2 3x3, not 0..3 3x2
	    if (tab[i] == tab[i+1])
		i--;
	    if (tab[i] != first)
		fprintf(stderr, "..%d", tab[i]);
	    run = 1;
	    last = -99;
	} else {
	    if (run > 1)
		fprintf(stderr, " x %d%s%d", run, i?", ":"", tab[i]);
	    else
		fprintf(stderr, "%s%d", i?", ":"", tab[i]);
	    run = 1;
	    last = tab[i];
	}
    }
    if (run > 1)
	fprintf(stderr, " x %d", run);
    fprintf(stderr, "}\n");
}

static void dump_map(unsigned int *map, int size, char *name) {
    int i, c = 0;
    fprintf(stderr, "\t%s\t{", name);
    for (i = 0; i < size; i++)
	if (map[i] != INT_MAX)
	    fprintf(stderr, "%s%d=%d", c++?", ":"", i, map[i]);
    fprintf(stderr, "}\n");
}

#pragma GCC diagnostic ignored "-Wunused-function"
static void dump_params(fqz_gparams *gp) {
    fprintf(stderr, "Global params = {\n");
    fprintf(stderr, "\tvers\t%d\n", gp->vers);
    fprintf(stderr, "\tgflags\t0x%02x\n", gp->gflags);
    fprintf(stderr, "\tnparam\t%d\n", gp->nparam);
    fprintf(stderr, "\tmax_sel\t%d\n", gp->max_sel);
    fprintf(stderr, "\tmax_sym\t%d\n", gp->max_sym);
    if (gp->gflags & GFLAG_HAVE_STAB)
	dump_table(gp->stab, 256, "stab");
    fprintf(stderr, "}\n");

    int i;
    for (i = 0; i < gp->nparam; i++) {
	fqz_param *pm = &gp->p[i];
	fprintf(stderr, "\nParam[%d] = {\n", i);
	fprintf(stderr, "\tcontext\t0x%04x\n", pm->context);
	fprintf(stderr, "\tpflags\t0x%02x\n",  pm->pflags);
	fprintf(stderr, "\tmax_sym\t%d\n",  pm->max_sym);
	fprintf(stderr, "\tqbits\t%d\n",   pm->qbits);
	fprintf(stderr, "\tqshift\t%d\n",  pm->qshift);
	fprintf(stderr, "\tqloc\t%d\n",    pm->qloc);
	fprintf(stderr, "\tsloc\t%d\n",    pm->sloc);
	fprintf(stderr, "\tploc\t%d\n",    pm->ploc);
	fprintf(stderr, "\tdloc\t%d\n",    pm->dloc);

	if (pm->pflags & PFLAG_HAVE_QMAP)
	    dump_map(pm->qmap, 256, "qmap");

	if (pm->pflags & PFLAG_HAVE_QTAB)
	    dump_table(pm->qtab, 256, "qtab");
	if (pm->pflags & PFLAG_HAVE_PTAB)
	    dump_table(pm->ptab, 1024, "ptab");
	if (pm->pflags & PFLAG_HAVE_DTAB)
	    dump_table(pm->dtab, 256, "dtab");
	fprintf(stderr, "}\n");
    }
}

typedef struct {
    SIMPLE_MODEL(QMAX,_) *qual;
    SIMPLE_MODEL(256,_)   len[4];
    SIMPLE_MODEL(2,_)     revcomp;
    SIMPLE_MODEL(256,_)   sel;
    SIMPLE_MODEL(2,_)     dup;
} fqz_model;

#ifndef NO_THREADS
/*
 * Thread local storage, used to avoid repeated malloc/free calls.
 */
pthread_once_t fqz_once = PTHREAD_ONCE_INIT;
pthread_key_t fqz_key;

static void fqz_tls_init(void) {
    pthread_key_create(&fqz_key, free);
}
#endif

static int fqz_create_models(fqz_model *m, fqz_gparams *gp) {
    int i;

#ifndef NO_THREADS
    pthread_once(&fqz_once, fqz_tls_init);

    m->qual = pthread_getspecific(fqz_key);

    if (!m->qual) {
        if (!(m->qual = malloc(sizeof(*m->qual) * CTX_SIZE)))
	    return -1;
	pthread_setspecific(fqz_key, m->qual);
    }
#else
    if (!(m->qual = malloc(sizeof(*m->qual) * CTX_SIZE)))
	return -1;
#endif

    for (i = 0; i < CTX_SIZE; i++)
	SIMPLE_MODEL(QMAX,_init)(&m->qual[i], gp->max_sym+1);

    for (i = 0; i < 4; i++)
	SIMPLE_MODEL(256,_init)(&m->len[i],256);

    SIMPLE_MODEL(2,_init)(&m->revcomp,2);
    SIMPLE_MODEL(2,_init)(&m->dup,2);
    if (gp->max_sel > 0)
	SIMPLE_MODEL(256,_init)(&m->sel, gp->max_sel+1);

    return 0;
}

static void fqz_destroy_models(fqz_model *m) {
#ifdef NO_THREADS
    free(m->qual);
#endif
}

static inline unsigned int fqz_update_ctx(fqz_param *pm, fqz_state *state, int q) {
    unsigned int last = 0; // pm->context
    state->qctx = (state->qctx << pm->qshift) + pm->qtab[q];
    last += (state->qctx & pm->qmask) << pm->qloc;

    // The final shifts have been factored into the tables already.
    last += pm->ptab[MIN(1023, state->p)];      // << pm->ploc
    last += pm->dtab[MIN(255,  state->delta)];  // << pm->dloc
    last += state->s << pm->sloc;

    // On the fly average is slow work.
    // However it can be slightly better than using a selector bit
    // as it's something we can compute on the fly and thus doesn't
    // consume output bits for storing the selector itself.
    //
    // Q4 (novaseq.bam)
    // qtot+=q*q -DQ1=8.84 -DQ2=8.51 -DQ3=7.70; 7203598 (-0.7%)
    // qtot+=q   -DQ1=2.96 -DQ2=2.85 -DQ3=2.69; 7207315
    // vs old delta;                            7255614 (default params)
    // vs 2 bit selector (no delta)             7203006 (-x 0x8261000e80)
    // vs 2 bit selector (no delta)             7199153 (-x 0x7270000e70) -0.8%
    // vs 2 bit selector (no delta)             7219668 (-x 0xa243000ea0)
    //{
    //	double qa = state->qtot / (state->qlen+.01);
    //	//fprintf(stderr, "%f\n", qa);
    //	int x = 0;
    //	if (qa>=Q1) x=3;
    //	else if (qa>=Q2) x=2;
    //	else if (qa>=Q3) x=1;
    //	else x=0;
    //	last += x << pm->dloc; // tmp reuse of delta pos
    //  state->qtot += q*q;
    //  state->qlen++;
    //}

    // Only update delta after 1st base.
    //state->delta += state->add_d * (state->prevq != q);
    //state->add_d = 1;
    state->delta += (state->prevq != q);
    state->prevq = q;

    state->p--;

    return last & (CTX_SIZE-1);
}


// Build quality stats for qhist and set nsym, do_dedup and do_sel params.
// One_param is -1 to gather stats on all data, or >= 0 to gather data
// on one specific selector parameter.  Used only in TEST_MAIN via
// fqz_manual_parameters at the moment.
void fqz_qual_stats(fqz_slice *s,
		    unsigned char *in, size_t in_size,
		    fqz_param *pm,
		    uint32_t qhist[256],
		    int one_param) {
#define NP 128
    uint32_t qhistb[NP][256] = {{0}};  // both
    uint32_t qhist1[NP][256] = {{0}};  // READ1 only
    uint32_t qhist2[NP][256] = {{0}};  // READ2 only
    uint64_t t1[NP] = {0};             // Count for READ1
    uint64_t t2[NP] = {0};             // COUNT for READ2
    uint32_t avg[2560] = {0};          // Avg qual *and later* avg-to-selector map.

    int dir = 0;
    int last_len = 0;
    int do_dedup = 0;
    size_t rec;
    size_t i, j;
    int num_rec = 0;

    // See what info we've been given.
    // Do we have READ1 / READ2?
    // Do we have selector hidden in the top bits of flag?
    int max_sel = 0;
    int has_r2 = 0;
    for (rec = 0; rec < s->num_records; rec++) {
	if (one_param >= 0 && (s->flags[rec] >> 16) != one_param)
	    continue;
	num_rec++;
	if (max_sel < (s->flags[rec] >> 16))
	    max_sel = (s->flags[rec] >> 16);
	if (s->flags[rec] & FQZ_FREAD2)
	    has_r2 = 1;
    }

    // Dedup detection and histogram stats gathering
    int *avg_qual = calloc((s->num_records+1), sizeof(int));
    if (!avg_qual)
	return;

    rec = i = j = 0;
    while (i < in_size) {
	if (one_param >= 0 && (s->flags[rec] >> 16) != one_param) {
	    avg_qual[rec] = 0;
	    i += s->len[rec++];
	    continue;
	}
	if (rec < s->num_records) {
	    j = s->len[rec];
	    dir = s->flags[rec] & FQZ_FREAD2 ? 1 : 0;
	    if (i > 0 && j == last_len
		&& !memcmp(in+i-last_len, in+i, j))
		do_dedup++; // cache which records are dup?
	} else {
	    j = in_size - i;
	    dir = 0;
	}
	last_len = j;

	uint32_t (*qh)[256] = dir ? qhist2 : qhist1;
	uint64_t *th        = dir ? t2     : t1;

	uint32_t tot = 0;
	for (; i < in_size && j > 0; i++, j--) {
	    tot += in[i];
	    qhist[in[i]]++;
	    qhistb[j & (NP-1)][in[i]]++;
	    qh[j & (NP-1)][in[i]]++;
	    th[j & (NP-1)]++;
	}
	tot = last_len ? (tot*10.0)/last_len+.5 : 0;

	avg_qual[rec] = tot;
	avg[MIN(2559, tot)]++;

	rec++;
    }
    pm->do_dedup = ((rec+1)/(do_dedup+1) < 500);

    last_len = 0;

    // Unique symbol count
    for (i = pm->max_sym = pm->nsym = 0; i < 256; i++) {
	if (qhist[i])
	    pm->max_sym = i, pm->nsym++;
    }


    // Auto tune: does average quality helps us?
    if (pm->do_qa != 0) {
	// Histogram of average qual in avg[]
	// NB: we convert avg[] from count to selector index

	// Few symbols means high compression which means
	// selector bits become more significant fraction.
	// Reduce selector bits by skewing the distribution
	// to not be even binning.
	double qf0 = pm->nsym > 8 ? 0.2 : 0.05;
	double qf1 = pm->nsym > 8 ? 0.5 : 0.22;
	double qf2 = pm->nsym > 8 ? 0.8 : 0.60;

	int total = 0;
	i = 0;
	while (i < 2560) {
	    total += avg[i];
	    if (total > qf0 * num_rec) {
		//fprintf(stderr, "Q1=%d\n", (int)i);
		break;
	    }
	    avg[i++] = 0;
	}
	while (i < 2560) {
	    total += avg[i];
	    if (total > qf1 * num_rec) {
		//fprintf(stderr, "Q2=%d\n", (int)i);
		break;
	    }
	    avg[i++] = 1;
	}
	while (i < 2560) {
	    total += avg[i];
	    if (total > qf2 * num_rec) {
		//fprintf(stderr, "Q3=%d\n", (int)i);
		break;
	    }
	    avg[i++] = 2;
	}
	while (i < 2560)
	    avg[i++] = 3;

	// Compute simple entropy of merged signal vs split signal.
        i = 0;
	rec = 0;

	int qbin4[4][NP][256] = {{{0}}};
	int qbin2[2][NP][256] = {{{0}}};
	int qbin1   [NP][256] = {{0}};
	int qcnt4[4][NP] = {{0}};
	int qcnt2[4][NP] = {{0}};
	int qcnt1   [NP] = {0};
        while (i < in_size) {
	    if (one_param >= 0 && (s->flags[rec] >> 16) != one_param) {
		i += s->len[rec++];
		continue;
	    }
	    if (rec < s->num_records)
		j = s->len[rec];
	    else
		j = in_size - i;
	    last_len = j;

	    uint32_t tot = avg_qual[rec];
	    int qb4 = avg[MIN(2559, tot)];
	    int qb2 = qb4/2;

	    for (; i < in_size && j > 0; i++, j--) {
		int x = j & (NP-1);
		qbin4[qb4][x][in[i]]++;  qcnt4[qb4][x]++;
		qbin2[qb2][x][in[i]]++;  qcnt2[qb2][x]++;
		qbin1     [x][in[i]]++;  qcnt1     [x]++;
	    }
	    rec++;
	}

	double e1 = 0, e2 = 0, e4 = 0;
	for (j = 0; j < NP; j++) {
	    for (i = 0; i < 256; i++) {
		if (qbin1   [j][i]) e1 += qbin1   [j][i] * log(qbin1   [j][i] / (double)qcnt1   [j]);
		if (qbin2[0][j][i]) e2 += qbin2[0][j][i] * log(qbin2[0][j][i] / (double)qcnt2[0][j]);
		if (qbin2[1][j][i]) e2 += qbin2[1][j][i] * log(qbin2[1][j][i] / (double)qcnt2[1][j]);
		if (qbin4[0][j][i]) e4 += qbin4[0][j][i] * log(qbin4[0][j][i] / (double)qcnt4[0][j]);
		if (qbin4[1][j][i]) e4 += qbin4[1][j][i] * log(qbin4[1][j][i] / (double)qcnt4[1][j]);
		if (qbin4[2][j][i]) e4 += qbin4[2][j][i] * log(qbin4[2][j][i] / (double)qcnt4[2][j]);
		if (qbin4[3][j][i]) e4 += qbin4[3][j][i] * log(qbin4[3][j][i] / (double)qcnt4[3][j]);
	    }
	}
	e1 /= -log(2)/8;
	e2 /= -log(2)/8;
	e4 /= -log(2)/8;
	//fprintf(stderr, "E1=%f E2=%f E4=%f\n", e1, e2+s->num_records/8, e4+s->num_records/4);

	// Note by using the selector we're robbing bits from elsewhere in
	// the context, which may reduce compression better.
	// We don't know how much by, so this is basically a guess!
	// For now we just say need 5% saving here.
	double qm = pm->do_qa > 0 ? 1 : 0.98;
	if ((pm->do_qa == -1 || pm->do_qa >= 4) &&
	    e4 + s->num_records/4 < e2*qm + s->num_records/8 &&
	    e4 + s->num_records/4 < e1*qm) {
	    //fprintf(stderr, "do q4\n");
	    for (i = 0; i < s->num_records; i++) {
		//fprintf(stderr, "%d -> %d -> %d, %d\n", (int)i, avg_qual[i], avg[MIN(2559, avg_qual[i])], s->flags[i]>>16);
		s->flags[i] |= avg[MIN(2559, avg_qual[i])] <<16;
	    }
	    pm->do_sel = 1;
	    max_sel = 3;
	} else if ((pm->do_qa == -1 || pm->do_qa >= 2) && e2 + s->num_records/8 < e1*qm) {
	    //fprintf(stderr, "do q2\n");
	    for (i = 0; i < s->num_records; i++)
		s->flags[i] |= (avg[MIN(2559, avg_qual[i])]>>1) <<16;
	    pm->do_sel = 1;
	    max_sel = 1;
	}

	if (pm->do_qa == -1) {
	    // assume qual, pos, delta in that order.
	    if (pm->pbits > 0 && pm->dbits > 0) {
		// 1 from pos/delta
		pm->sloc = pm->dloc-1;
		pm->pbits--;
		pm->dbits--;
		pm->dloc++;
	    } else if (pm->dbits >= 2) {
		// 2 from delta
		pm->sloc = pm->dloc;
		pm->dbits -= 2;
		pm->dloc += 2;
	    } else if (pm->qbits >= 2) {
		pm->qbits -= 2;
		pm->ploc -= 2;
		pm->sloc = 16-2 - pm->do_r2;
		if (pm->qbits == 6 && pm->qshift == 5)
		    pm->qbits--;
	    }
	    pm->do_qa = 4;
	}
    }

    // Auto tune: does splitting up READ1 and READ2 help us?
    if (has_r2 || pm->do_r2) { // FIXME: && but debug for now
	double e1 = 0, e2 = 0; // entropy sum

	for (j = 0; j < NP; j++) {
	    if (!t1[j] || !t2[j]) continue;
	    for (i = 0; i < 256; i++) {
		if (!qhistb[j][i]) continue;
		e1 -= (qhistb[j][i])*log(qhistb[j][i] / (double)(t1[j]+t2[j]));
		if (qhist1[j][i])
		    e2 -= qhist1[j][i] * log(qhist1[j][i] / (double)t1[j]);
		if (qhist2[j][i])
		    e2 -= qhist2[j][i] * log(qhist2[j][i] / (double)t2[j]);
	    }
	}
	e1 /= log(2)*8; // bytes
	e2 /= log(2)*8;

	//fprintf(stderr, "read1/2 entropy merge %f split %f\n", e1, e2);

	// Note by using the selector we're robbing bits from elsewhere in
	// the context, which may reduce compression better.
	// We don't know how much by, so this is basically a guess!
	// For now we just say need 5% saving here.
	double qm = pm->do_r2 > 0 ? 1 : 0.95;
	if (e2 + (8+s->num_records/8) < e1*qm) {
	    for (rec = 0; rec < s->num_records; rec++) {
		if (one_param >= 0 && (s->flags[rec] >> 16) != one_param)
		    continue;
		int sel = s->flags[rec] >> 16;
		s->flags[rec] =  (s->flags[rec] & 0xffff)
		    | ((s->flags[rec] & FQZ_FREAD2)
		       ? ((sel*2)+1) << 16
		       : ((sel*2)+0) << 16);
		if (max_sel < (s->flags[rec]>>16))
		    max_sel = (s->flags[rec]>>16);
	    }
	}
    }

    // We provided explicit selector data or auto-tuned it
    if (max_sel > 0) {
	pm->do_sel = 1;
	pm->max_sel = max_sel;
    }

    free(avg_qual);
}

static inline
int fqz_store_parameters1(fqz_param *pm, unsigned char *comp) {
    int comp_idx = 0, i, j;

    // Starting context
    comp[comp_idx++] = pm->context;
    comp[comp_idx++] = pm->context >> 8;

    comp[comp_idx++] = pm->pflags;
    comp[comp_idx++] = pm->max_sym;

    comp[comp_idx++] = (pm->qbits<<4)|pm->qshift;
    comp[comp_idx++] = (pm->qloc<<4)|pm->sloc;
    comp[comp_idx++] = (pm->ploc<<4)|pm->dloc;

    if (pm->store_qmap) {
	for (i = j = 0; i < 256; i++)
	    if (pm->qmap[i] != INT_MAX)
		comp[comp_idx++] = i;
    }

    if (pm->qbits && pm->use_qtab)
	// custom qtab
	comp_idx += store_array(comp+comp_idx, pm->qtab, 256);

    if (pm->pbits && pm->use_ptab)
	// custom ptab
	comp_idx += store_array(comp+comp_idx, pm->ptab, 1024);

    if (pm->dbits && pm->use_dtab)
	// custom dtab
	comp_idx += store_array(comp+comp_idx, pm->dtab, 256);

    return comp_idx;
}

static 
int fqz_store_parameters(fqz_gparams *gp, unsigned char *comp) {
    int comp_idx = 0;
    comp[comp_idx++] = gp->vers; // Format number

    comp[comp_idx++] = gp->gflags;

    if (gp->gflags & GFLAG_MULTI_PARAM)
	comp[comp_idx++] = gp->nparam;

    if (gp->gflags & GFLAG_HAVE_STAB) {
	comp[comp_idx++] = gp->max_sel;
	comp_idx += store_array(comp+comp_idx, gp->stab, 256);
    }

    int i;
    for (i = 0; i < gp->nparam; i++)
	comp_idx += fqz_store_parameters1(&gp->p[i], comp+comp_idx);

    //fprintf(stderr, "Encoded %d bytes of param\n", comp_idx);
    return comp_idx;
}

// Choose a set of parameters based on quality statistics and
// some predefined options (selected via "strat").
static inline
int fqz_pick_parameters(fqz_gparams *gp,
			int vers,
			int strat,
			fqz_slice *s,
			unsigned char *in,
			size_t in_size) {
    //approx sqrt(delta), must be sequential
    int dsqr[] = {
	0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
	5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
    };
    uint32_t qhist[256] = {0};

    if (strat >= nstrats) strat = nstrats-1;

    // Start with 1 set of parameters.
    // FIXME: add support for multiple params later.
    memset(gp, 0, sizeof(*gp));
    gp->vers = FQZ_VERS;

    if (!(gp->p = calloc(1, sizeof(fqz_param))))
	return -1;
    gp->nparam = 1;
    gp->max_sel = 0;

    if (vers == 3) // V3.0 doesn't store qual in original orientation
	gp->gflags |= GFLAG_DO_REV;

    fqz_param *pm = gp->p;

    // Programmed strategies, which we then amend based on our
    // statistical analysis of the quality stream.
    pm->qbits  = strat_opts[strat][0];
    pm->qshift = strat_opts[strat][1];
    pm->pbits  = strat_opts[strat][2];
    pm->pshift = strat_opts[strat][3];
    pm->dbits  = strat_opts[strat][4];
    pm->dshift = strat_opts[strat][5];
    pm->qloc   = strat_opts[strat][6];
    pm->sloc   = strat_opts[strat][7];
    pm->ploc   = strat_opts[strat][8];
    pm->dloc   = strat_opts[strat][9];

    // Params for controlling behaviour here.
    pm->do_r2 = strat_opts[strat][10];
    pm->do_qa = strat_opts[strat][11];

    // Validity check input lengths and buffer size
    size_t tlen = 0, i;
    for (i = 0; i < s->num_records; i++) {
	if (tlen + s->len[i] > in_size)
	    // Oversized buffer
	    s->len[i] = in_size - tlen;
	tlen += s->len[i];
    }
    if (s->num_records > 0 && tlen < in_size)
	// Undersized buffer
	s->len[s->num_records-1] += in_size - tlen;

    // Quality metrics, for all recs
    fqz_qual_stats(s, in, in_size, pm, qhist, -1);

    pm->store_qmap = (pm->nsym <= 8 && pm->nsym*2 < pm->max_sym);

    // Check for fixed length.
    uint32_t first_len = s->len[0];
    for (i = 1; i < s->num_records; i++) {
	if (s->len[i] != first_len)
	    break;
    }
    pm->fixed_len = (i == s->num_records);
    pm->use_qtab = 0; // unused by current encoder

    if (strat >= nstrats-1)
	goto manually_set; // used in TEST_MAIN for debugging

    if (pm->pshift < 0)
	pm->pshift = MAX(0, log((double)s->len[0]/(1<<pm->pbits))/log(2)+.5);

    if (pm->nsym <= 4) {
	// NovaSeq
	pm->qshift = 2; // qmax 64, although we can store up to 256 if needed
	if (in_size < 5000000) {
	    pm->pbits =2;
	    pm->pshift=5;
	}
    } else if (pm->nsym <= 8) {
	// HiSeqX
	pm->qbits =MIN(pm->qbits,9);
	pm->qshift=3;
	if (in_size < 5000000)
	    pm->qbits =6;
    }

    if (in_size < 300000) {
	pm->qbits=pm->qshift;
	pm->dbits=2;
    }

 manually_set:
//    fprintf(stderr, "-x 0x%x%x%x%x%x%x%x%x%x%x%x%x\n",
//	    pm->qbits, pm->qshift,
//	    pm->pbits, pm->pshift,
//	    pm->dbits, pm->dshift,
//	    pm->qloc, pm->sloc, pm->ploc, pm->dloc,
//	    pm->do_r2, pm->do_qa);

    for (i = 0; i < sizeof(dsqr)/sizeof(*dsqr); i++)
	if (dsqr[i] > (1<<pm->dbits)-1)
	    dsqr[i] = (1<<pm->dbits)-1;

    if (pm->store_qmap) {
	int j;
	for (i = j = 0; i < 256; i++)
	    if (qhist[i])
		pm->qmap[i] = j++;
	    else
		pm->qmap[i] = INT_MAX;
	pm->max_sym = pm->nsym;
    } else {
	pm->nsym = 255;
	for (i = 0; i < 256; i++)
	    pm->qmap[i] = i;
    }
    if (gp->max_sym < pm->max_sym)
	gp->max_sym = pm->max_sym;

    // Produce ptab from pshift.
    if (pm->qbits) {
	for (i = 0; i < 256; i++) {
	    pm->qtab[i] = i; // 1:1

	    // Alternative mappings:
	    //qtab[i] = i > 30 ? MIN(max_sym,i)-15 : i/2;  // eg for 9827 BAM
	}

    }
    pm->qmask = (1<<pm->qbits)-1;

    if (pm->pbits) {
	for (i = 0; i < 1024; i++)
	    pm->ptab[i] = MIN((1<<pm->pbits)-1, i>>pm->pshift);

	// Alternatively via analysis of quality distributions we
	// may select a bunch of positions that are special and
	// have a non-uniform ptab[].
	// Manual experimentation on a NovaSeq run saved 2.8% here.
    }

    if (pm->dbits) {
	for (i = 0; i < 256; i++)
	    pm->dtab[i] = dsqr[MIN(sizeof(dsqr)/sizeof(*dsqr)-1, i>>pm->dshift)];
    }

    pm->use_ptab = (pm->pbits > 0);
    pm->use_dtab = (pm->dbits > 0);

    pm->pflags =
	(pm->use_qtab   ?PFLAG_HAVE_QTAB :0)|
	(pm->use_dtab   ?PFLAG_HAVE_DTAB :0)|
	(pm->use_ptab   ?PFLAG_HAVE_PTAB :0)|
	(pm->do_sel     ?PFLAG_DO_SEL    :0)|
	(pm->fixed_len  ?PFLAG_DO_LEN    :0)|
	(pm->do_dedup   ?PFLAG_DO_DEDUP  :0)|
	(pm->store_qmap ?PFLAG_HAVE_QMAP :0);

    gp->max_sel = 0;
    if (pm->do_sel) {
	// 2 selectors values, but 1 parameter block.
	// We'll use the sloc instead to encode the selector bits into
	// the context.
	gp->max_sel = 1; // indicator to check recs
	gp->gflags |= GFLAG_HAVE_STAB;
	// NB: stab is already all zero
    }

    if (gp->max_sel) {
	int max = 0;
	for (i = 0; i < s->num_records; i++) {
	    if (max < (s->flags[i] >> 16))
		max = (s->flags[i] >> 16);
	}
	gp->max_sel = max;
    }

    return 0;
}

static void fqz_free_parameters(fqz_gparams *gp) {
    if (gp && gp->p) free(gp->p);
}

static
unsigned char *compress_block_fqz2f(int vers,
				    int strat,
				    fqz_slice *s,
				    unsigned char *in,
				    size_t in_size,
				    size_t *out_size,
				    fqz_gparams *gp) {
    fqz_gparams local_gp;
    int free_params = 0;

    unsigned int last = 0;
    size_t i, j;
    ssize_t rec = 0;

    int last_len = 0;
    int comp_idx = 0;
    RangeCoder rc;

    unsigned char *comp = (unsigned char *)malloc(in_size*1.1+100000);
    unsigned char *compe = comp + (size_t)(in_size*1.1+100000);
    if (!comp)
	return NULL;

    // Pick and store params
    if (!gp) {
	gp = &local_gp;
	if (fqz_pick_parameters(gp, vers, strat, s, in, in_size) < 0)
	    return NULL;
	free_params = 1;
    }

    //dump_params(gp);
    comp_idx = var_put_u32(comp, compe, in_size);
    comp_idx += fqz_store_parameters(gp, comp+comp_idx);

    fqz_param *pm;

    // Optimise tables to remove shifts in loop (NB: cannot do this in next vers)
    for (j = 0; j < gp->nparam; j++) {
	pm = &gp->p[j];

	for (i = 0; i < 1024; i++)
	    pm->ptab[i] <<= pm->ploc;

	for (i = 0; i < 256; i++)
	    pm->dtab[i] <<= pm->dloc;
    }

    // Create models and initialise range coder
    fqz_model model;
    if (fqz_create_models(&model, gp) < 0)
	return NULL;

    RC_SetOutput(&rc, (char *)comp+comp_idx);
    RC_StartEncode(&rc);

    // For CRAM3.1, reverse upfront if needed
    pm = &gp->p[0];
    if (gp->gflags & GFLAG_DO_REV) {
	i = rec = j = 0;
	while (i < in_size) {
	    int len = rec < s->num_records-1
		? s->len[rec] : in_size - i;

	    if (s->flags[rec] & FQZ_FREVERSE) {
		// Reverse complement sequence - note: modifies buffer
		int I,J;
		unsigned char *cp = in+i;
		for (I = 0, J = len-1; I < J; I++, J--) {
		    unsigned char c;
		    c = cp[I];
		    cp[I] = cp[J];
		    cp[J] = c;
		}
	    }

	    i += len;
	    rec++;
	}
	rec = 0;
    }

    fqz_state state = {0};
    pm = &gp->p[0];
    state.p = 0;
    state.first_len = 1;
    int x;

    for (i = 0; i < in_size; i++) {
	if (state.p == 0) {
	    if (pm->do_sel || (gp->gflags & GFLAG_MULTI_PARAM)) {
		state.s = rec < s->num_records
		    ? s->flags[rec] >> 16 // reuse spare bits
		    : 0;
		SIMPLE_MODEL(256,_encodeSymbol)(&model.sel, &rc, state.s);
		//fprintf(stderr, "State %d\n", state.s);
	    } else {
		state.s = 0;
	    }
	    x = (gp->gflags & GFLAG_HAVE_STAB) ? gp->stab[state.s] : state.s;
	    pm = &gp->p[x];

	    //fprintf(stderr, "sel %d param %d\n", state.s, x);

	    int len = s->len[rec];
	    if (!pm->fixed_len || state.first_len) {
		SIMPLE_MODEL(256,_encodeSymbol)(&model.len[0], &rc, (len>> 0) & 0xff);
		SIMPLE_MODEL(256,_encodeSymbol)(&model.len[1], &rc, (len>> 8) & 0xff);
		SIMPLE_MODEL(256,_encodeSymbol)(&model.len[2], &rc, (len>>16) & 0xff);
		SIMPLE_MODEL(256,_encodeSymbol)(&model.len[3], &rc, (len>>24) & 0xff);
		//fprintf(stderr, "Len %d\n", len);
		state.first_len = 0;
	    }

	    if (gp->gflags & GFLAG_DO_REV) {
		// no need to reverse complement for V4.0 as the core format
		// already has this feature.
		if (s->flags[rec] & FQZ_FREVERSE)
		    SIMPLE_MODEL(2,_encodeSymbol)(&model.revcomp, &rc, 1);
		else
		    SIMPLE_MODEL(2,_encodeSymbol)(&model.revcomp, &rc, 0);
		//fprintf(stderr, "Rev %d\n", (s->flags[rec] & FQZ_FREVERSE) ? 1 : 0);
	    }

	    rec++;

	    state.qtot = 0;
	    state.qlen = 0;

	    state.p = len;
	    state.add_d = 0;
	    state.delta = 0;
	    state.qctx = 0;
	    state.prevq = 0;

	    last = pm->context;

	    if (pm->do_dedup) {
		// Possible dup of previous read?
		if (i && len == last_len && !memcmp(in+i-last_len, in+i, len)) {
		    SIMPLE_MODEL(2,_encodeSymbol)(&model.dup, &rc, 1);
		    i += len-1;
		    state.p = 0;
		    //fprintf(stderr, "Dup 1\n");
		    continue;
		} else {
		    SIMPLE_MODEL(2,_encodeSymbol)(&model.dup, &rc, 0);
		    //fprintf(stderr, "Dup 0\n");
		}

		last_len = len;
	    }
	}

	unsigned char q = in[i];
	unsigned char qm = pm->qmap[q];

	SIMPLE_MODEL(QMAX,_encodeSymbol)(&model.qual[last], &rc, qm);
	//fprintf(stderr, "Sym %d with ctx %04x delta %d prevq %d q %d\n", qm, last, state.delta, state.prevq, qm);
	//fprintf(stderr, "pos=%d, delta=%d\n", state.p, state.delta);
	last = fqz_update_ctx(pm, &state, qm);
    }

    RC_FinishEncode(&rc);

    // For CRAM3.1, undo our earlier reversal step
    if (gp->gflags & GFLAG_DO_REV) {
	i = rec = j = 0;
	while (i < in_size) {
	    int len = rec < s->num_records-1
		? s->len[rec]
		: in_size - i;

	    if (s->flags[rec] & FQZ_FREVERSE) {
		// Reverse complement sequence - note: modifies buffer
		int I,J;
		unsigned char *cp = in+i;
		for (I = 0, J = len-1; I < J; I++, J--) {
		    unsigned char c;
		    c = cp[I];
		    cp[I] = cp[J];
		    cp[J] = c;
		}
	    }

	    i += len;
	    rec++;
	}
    }

    // Clear selector abuse of flags
    for (rec = 0; rec < s->num_records; rec++)
	s->flags[rec] &= 0xffff;

    *out_size = comp_idx + RC_OutSize(&rc);
    //fprintf(stderr, "%d -> %d\n", (int)in_size, (int)*out_size);

    fqz_destroy_models(&model);
    if (free_params)
	fqz_free_parameters(gp);

    return comp;
}

// Read fqz paramaters.
//
// FIXME: pass in and check in_size.
//
// Returns number of bytes read on success,
//         -1 on failure.
static inline
int fqz_read_parameters1(fqz_param *pm, unsigned char *in, size_t in_size) {
    int in_idx = 0;
    size_t i;

    if (in_size < 7)
	return -1;

    // Starting context
    pm->context = in[in_idx] + (in[in_idx+1]<<8);
    in_idx += 2;

    // Bit flags
    pm->pflags     = in[in_idx++];
    pm->use_qtab   = pm->pflags & PFLAG_HAVE_QTAB;
    pm->use_dtab   = pm->pflags & PFLAG_HAVE_DTAB;
    pm->use_ptab   = pm->pflags & PFLAG_HAVE_PTAB;
    pm->do_sel     = pm->pflags & PFLAG_DO_SEL;
    pm->fixed_len  = pm->pflags & PFLAG_DO_LEN;
    pm->do_dedup   = pm->pflags & PFLAG_DO_DEDUP;
    pm->store_qmap = pm->pflags & PFLAG_HAVE_QMAP;
    pm->max_sym    = in[in_idx++];

    // Sub-context sizes and locations
    pm->qbits      = in[in_idx]>>4;
    pm->qmask      = (1<<pm->qbits)-1;
    pm->qshift     = in[in_idx++]&15;
    pm->qloc       = in[in_idx]>>4;
    pm->sloc       = in[in_idx++]&15;
    pm->ploc       = in[in_idx]>>4;
    pm->dloc       = in[in_idx++]&15;

    // Maps and tables
    if (pm->store_qmap) {
	for (i = 0; i < 256; i++) pm->qmap[i] = INT_MAX; // so dump_map works
	if (in_idx + pm->max_sym > in_size)
	    return -1;
	for (i = 0; i < pm->max_sym; i++)
	    pm->qmap[i] = in[in_idx++];
    } else {
	for (i = 0; i < 256; i++)
	    pm->qmap[i] = i;
    }

    if (pm->qbits) {
	if (pm->use_qtab)
	    in_idx += read_array(in+in_idx, in_size-in_idx, pm->qtab, 256);
	else
	    for (i = 0; i < 256; i++)
		pm->qtab[i] = i;
    }

    if (pm->use_ptab)
	in_idx += read_array(in+in_idx, in_size-in_idx, pm->ptab, 1024);
    else
	for (i = 0; i < 1024; i++)
	    pm->ptab[i] = 0;

    if (pm->use_dtab)
        in_idx += read_array(in+in_idx, in_size-in_idx, pm->dtab, 256);
    else
	for (i = 0; i < 256; i++)
	    pm->dtab[i] = 0;

    return in_idx;
}

static
int fqz_read_parameters(fqz_gparams *gp, unsigned char *in, size_t in_size) {
    int in_idx = 0;
    int i;

    if (in_size < 10)
	return -1;

    // Format version
    gp->vers = in[in_idx++];
    if (gp->vers != FQZ_VERS)
	return -1;

    // Global glags
    gp->gflags = in[in_idx++];

    // Number of param blocks and param selector details
    gp->nparam = (gp->gflags & GFLAG_MULTI_PARAM) ? in[in_idx++] : 1;
    if (gp->nparam <= 0)
	return -1;
    gp->max_sel = gp->nparam > 1 ? gp->nparam : 0;

    if (gp->gflags & GFLAG_HAVE_STAB) {
	gp->max_sel = in[in_idx++];
	in_idx += read_array(in+in_idx, in_size-in_idx, gp->stab, 256);
    } else {
	for (i = 0; i < gp->nparam; i++)
	    gp->stab[i] = i;
	for (; i < 256; i++)
	    gp->stab[i] = gp->nparam-1;
    }

    // Load the individual parameter locks
    if (!(gp->p = malloc(gp->nparam * sizeof(*gp->p))))
	return -1;

    gp->max_sym = 0;
    for (i = 0; i < gp->nparam; i++) {
	int e = fqz_read_parameters1(&gp->p[i], in + in_idx, in_size-in_idx);
	if (e < 0)
	    goto err;
	in_idx += e;

	if (gp->max_sym < gp->p[i].max_sym)
	    gp->max_sym = gp->p[i].max_sym;
    }

    //fprintf(stderr, "Decoded %d bytes of param\n", in_idx);
    return in_idx;

 err:
    fqz_free_parameters(gp);
    gp->nparam = 0;
    return -1;
}

static
unsigned char *uncompress_block_fqz2f(fqz_slice *s,
				      unsigned char *in,
				      size_t in_size,
				      size_t *out_size,
				      int *lengths,
				      int nlengths) {
    fqz_gparams gp;
    fqz_param *pm;
    char *rev_a = NULL;
    int *len_a = NULL;
    memset(&gp, 0, sizeof(gp));

    uint32_t len;
    ssize_t i, rec = 0, in_idx;
    in_idx = var_get_u32(in, in+in_size, &len);
    *out_size = len;

#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (len > 100000)
	return NULL;
#endif

    unsigned char *uncomp = NULL;
    RangeCoder rc;
    unsigned int last = 0;

    // Decode parameter blocks
    if ((i = fqz_read_parameters(&gp, in+in_idx, in_size-in_idx)) < 0)
	return NULL;
    //dump_params(&gp);
    in_idx += i;

    // Optimisations to remove shifts from main loop
    for (i = 0; i < gp.nparam; i++) {
	int j;
	pm = &gp.p[i];
	for (j = 0; j < 1024; j++)
	    pm->ptab[j] <<= pm->ploc;
	for (j = 0; j < 256; j++)
	    pm->dtab[j] <<= pm->dloc;
    }

    // Initialise models and entropy coder
    fqz_model model;
    if (fqz_create_models(&model, &gp) < 0)
	return NULL;

    RC_SetInput(&rc, (char *)in+in_idx, (char *)in+in_size);
    RC_StartDecode(&rc);


    // Allocate buffers
    uncomp = (unsigned char *)malloc(*out_size);
    if (!uncomp)
	goto err;

    int nrec = 1000;
    rev_a = malloc(nrec);
    len_a = malloc(nrec * sizeof(int));
    if (!rev_a || !len_a)
	goto err;

    // Main decode loop
    fqz_state state;
    state.delta = 0;
    state.prevq = 0;
    state.qctx = 0;
    state.p = 0;
    state.s = 0;
    state.first_len = 1;

    int rev = 0;
    int last_len = 0;
    int x = 0;
    pm = &gp.p[x];
    for (rec = i = 0; i < len; i++) {
	if (rec >= nrec) {
	    nrec *= 2;
	    rev_a = realloc(rev_a, nrec);
	    len_a = realloc(len_a, nrec*sizeof(int));
	    if (!rev_a || !len_a)
		goto err;
	}

	if (state.p == 0) {
	    // New record
	    if (pm->do_sel) {
		state.s = SIMPLE_MODEL(256,_decodeSymbol)(&model.sel, &rc);
		//fprintf(stderr, "State %d\n", state.s);
	    } else {
		state.s = 0;
	    }
	    x = (gp.gflags & GFLAG_HAVE_STAB) ? gp.stab[MIN(255, state.s)] : state.s;
	    if (x >= gp.nparam)
		goto err;
	    pm = &gp.p[x];

	    unsigned int len = last_len;
	    if (!pm->fixed_len || state.first_len) {
		len  = SIMPLE_MODEL(256,_decodeSymbol)(&model.len[0], &rc);
		len |= SIMPLE_MODEL(256,_decodeSymbol)(&model.len[1], &rc)<<8;
		len |= SIMPLE_MODEL(256,_decodeSymbol)(&model.len[2], &rc)<<16;
		len |= ((unsigned)SIMPLE_MODEL(256,_decodeSymbol)(&model.len[3], &rc))<<24;
		//fprintf(stderr, "Len %d\n", len);
		state.first_len = 0;
		last_len = len;
	    }
	    if (len > *out_size-i || len <= 0)
		goto err;

	    if (lengths && rec < nlengths)
		lengths[rec] = len;

	    if (gp.gflags & GFLAG_DO_REV) {
		rev = SIMPLE_MODEL(2,_decodeSymbol)(&model.revcomp, &rc);
		//fprintf(stderr, "rev %d\n", rev);
		rev_a[rec] = rev;
		len_a[rec] = len;
	    }

	    if (pm->do_dedup) {
		if (SIMPLE_MODEL(2,_decodeSymbol)(&model.dup, &rc)) {
		    //fprintf(stderr, "Dup 1\n");
		    // Dup of last line
		    if (len > i)
			goto err;
		    memcpy(uncomp+i, uncomp+i-len, len);
		    i += len-1;
		    state.p = 0;
		    rec++;
		    continue;
		} else {
		    //fprintf(stderr, "Dup 0\n");
		}
	    }

	    rec++;

	    state.p = len;
	    state.add_d = 0;
	    state.delta = 0;
	    state.prevq = 0;
	    state.qctx = 0;

	    last = pm->context;
	}

	// Decode and output quality
	unsigned char Q = SIMPLE_MODEL(QMAX,_decodeSymbol)(&model.qual[last], &rc);
	unsigned char q = pm->qmap[Q];
	//fprintf(stderr, "Sym %d with ctx %04x delta %d prevq %d q %d\n", Q, last, state.delta, state.prevq, Q);
        uncomp[i] = q;

	// Compute new quality context
	last = fqz_update_ctx(pm, &state, Q);
    }

    if (rec >= nrec) {
	nrec *= 2;
	rev_a = realloc(rev_a, nrec);
	len_a = realloc(len_a, nrec*sizeof(int));
	if (!rev_a || !len_a)
	    goto err;
    }
    rev_a[rec] = rev;
    len_a[rec] = len;

    if (gp.gflags & GFLAG_DO_REV) {
	for (i = rec = 0; i < len && rec < nrec; i += len_a[rec++]) {
	    if (!rev_a[rec])
		continue;

	    int I, J;
	    unsigned char *cp = uncomp+i;
	    for (I = 0, J = len_a[rec]-1; I < J; I++, J--) {
		unsigned char c;
		c = cp[I];
		cp[I] = cp[J];
		cp[J] = c;
	    }
	}
    }

    RC_FinishDecode(&rc);
    fqz_destroy_models(&model);
    //free(model.qual);
    free(rev_a);
    free(len_a);
    fqz_free_parameters(&gp);

#ifdef TEST_MAIN
    s->num_records = rec;
#endif

    return uncomp;

 err:
    fqz_destroy_models(&model);
    free(rev_a);
    free(len_a);
    fqz_free_parameters(&gp);
    free(uncomp);

    return NULL;
}

char *fqz_compress(int vers, fqz_slice *s, char *in, size_t uncomp_size,
		   size_t *comp_size, int strat, fqz_gparams *gp) {
    return (char *)compress_block_fqz2f(vers, strat, s, (unsigned char *)in,
					uncomp_size, comp_size, gp);
}

char *fqz_decompress(char *in, size_t comp_size, size_t *uncomp_size,
		     int *lengths, int nlengths) {
    return (char *)uncompress_block_fqz2f(NULL, (unsigned char *)in,
					  comp_size, uncomp_size, lengths, nlengths);
}
